In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
In [2]:
sns.set(style='whitegrid')
In [3]:
products=pd.read_csv(r"../data/products_clean.csv")
In [4]:
df=pd.DataFrame(products)

TOP 5 HIGHEST RATED PRODUCTS

In [5]:
top_rated=df.sort_values(by='rating_rate',ascending=False).head(5)
print("Top 5 highest rated products are:")
print(top_rated[['title','rating_rate']])
Top 5 highest rated products are:
                                                title  rating_rate
10  Silicon Power 256GB SSD 3D NAND A55 SLC Cache ...          4.8
11  WD 4TB Gaming Drive Works with Playstation 4 P...          4.8
2                                  Mens Cotton Jacket          4.7
17        MBJ Women's Solid Short Sleeve Boat Neck V           4.7
4   John Hardy Women's Legends Naga Gold & Silver ...          4.6

TOP 5 MOST RATED PRODUCTS

In [6]:
top_most_rated=df.sort_values(by='rating_count',ascending=False).head(5)
print("Top 5 most rated products are:")
print(top_most_rated[['title','rating_count']])
Top 5 most rated products are:
                                                title  rating_count
16  Rain Jacket Women Windbreaker Striped Climbing...           679
2                                  Mens Cotton Jacket           500
9   SanDisk SSD PLUS 1TB Internal SSD - SATA III 6...           470
3                                Mens Casual Slim Fit           430
4   John Hardy Women's Legends Naga Gold & Silver ...           400

5 MOST EXPENSIVE PRODUCTS AND 5 MOST CHEAPTEST PRODUCTS

In [7]:
most_expensive=df.sort_values(by='price',ascending=False).head(5)
cheaptest=df.sort_values(by='price',ascending=True).head(5)
print("Top 5 most expensive products are:")
print(most_expensive[['title','price']])
print("\nTop 5 cheapest products are:")
print(top_rated[['title','price']])
Top 5 most expensive products are:
                                                title   price
13  Samsung 49-Inch CHG90 144Hz Curved Gaming Moni...  999.99
4   John Hardy Women's Legends Naga Gold & Silver ...  695.00
12  Acer SB220Q bi 21.5 inches Full HD (1920 x 108...  599.00
5                        Solid Gold Petite Micropave   168.00
11  WD 4TB Gaming Drive Works with Playstation 4 P...  114.00

Top 5 cheapest products are:
                                                title   price
10  Silicon Power 256GB SSD 3D NAND A55 SLC Cache ...  109.00
11  WD 4TB Gaming Drive Works with Playstation 4 P...  114.00
2                                  Mens Cotton Jacket   55.99
17        MBJ Women's Solid Short Sleeve Boat Neck V     9.85
4   John Hardy Women's Legends Naga Gold & Silver ...  695.00

NUMBER OF PRODUCTS PER CATEGORY

In [8]:
category_counts=df['category'].value_counts().reset_index()
category_counts.columns=['category','product_count']
fig=px.treemap(
    category_counts,
    path=['category'],
    values='product_count',
    color='product_count',
    color_continuous_scale='agsunset',
    title='Number of Products in Each Category'
)
fig.show()
fig.write_html("products_per_category.html")

AVERAGE PRICE PER PRODUCT CATEGORY

In [9]:
avg_price=df.groupby('category')['price'].mean().round(2).reset_index()
avg_price.rename(columns={'price':'avg_price'},inplace=True)
print("Average price per category is:")
print(avg_price)
Average price per category is:
           category  avg_price
0       electronics     332.50
1          jewelery     221.00
2    men's clothing      51.06
3  women's clothing      26.29
In [10]:
fig=px.bar(avg_price,x='category',y='avg_price',text='avg_price',color='category',title='Average Price Per Product Category')
fig.update_traces(texttemplate='%{text}',textposition='outside',hovertemplate='Category:%{x}<br>Avg Price:₹%{y}')
fig.update_layout(
    xaxis_title='Product Category',yaxis_title='Average Price',showlegend=False,uniformtext_minsize=8,uniformtext_mode='hide',
)
fig.show()
fig.write_html("avg_price_product.html")

Relationship Between Rating Count And Price(And their ratings)

In [11]:
plt.figure(figsize=(10,6))
scatter=sns.scatterplot(
    data=df,
    x='price',
    y='rating_count',
    hue='rating_rate',
    palette='coolwarm',
    size='rating_count',
    sizes=(20,200),
    alpha=0.7,
    edgecolor='gray'
)
sns.regplot(
    data=df,
    x='price',
    y='rating_count',
    scatter=False,
    color='blue',
    line_kws={"linewidth":2,"linestyle":"dashed"}
    
)
plt.title('Relationship between Rating Count & Price(colored by rating)')
plt.xlabel('Price')
plt.ylabel('Rating Count')
plt.legend(title='Ratings')
plt.tight_layout()
plt.show()
No description has been provided for this image

Interactive Graph for the above

In [12]:
fig = px.scatter(
    df,
    x='price',
    y='rating_count',
    color='rating_rate',
    size='rating_count',
    hover_data=['title', 'rating_rate', 'price', 'rating_count'],
    color_continuous_scale='Viridis',
    title='💬 Price vs Rating Count (Colored by Rating)'
)

fig.update_traces(marker=dict(opacity=0.7, line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(
    xaxis_title='Price (₹)',
    yaxis_title='Rating Count',
    coloraxis_colorbar=dict(title=' Rating'),
    legend_title='Rating'
)

fig.show()
fig.write_html("price_vs_rating_count.html")

CORRELATION BETWEEN PRICE AND RATING

In [13]:
correlation=df[['price','rating_rate','rating_count']].corr()
print("Correlation Matrix")
print(correlation)
Correlation Matrix
                 price  rating_rate  rating_count
price         1.000000    -0.154229     -0.108585
rating_rate  -0.154229     1.000000      0.117721
rating_count -0.108585     0.117721      1.000000

Impact of Category and Rating On Price

In [14]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect("product_data.db")

# Save DataFrame into SQLite
df.to_sql("products", conn, if_exists="replace", index=False)

# Close connection
conn.close()
In [ ]:
 
In [15]:
fig=px.scatter(
    df,
    x='rating_rate',
    y='price',
    color='rating_count',
    facet_col='category',
    facet_col_wrap=3,
    hover_data=['title','price','rating_rate','rating_count'],
    color_continuous_scale='Sunsetdark',
    title='Category+Rating Impact on Price',
    height=1000
)
fig.update_traces(marker=dict(size=7, opacity=0.7), selector=dict(mode='markers'))
fig.update_layout(coloraxis_colorbar=dict(title='Rating Count'), showlegend=False)
fig.update_xaxes(title='Rating')
fig.update_yaxes(title='Price (₹)')

fig.show()
fig.write_html("category_rating_vs_price.html")

Loading the dataframe to sql to run queries

In [16]:
conn = sqlite3.connect("product_data.db")

query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(query).fetchall()

print("Existing tables:", tables)

conn.close()
Existing tables: [('products',)]
In [17]:
conn = sqlite3.connect("product_data.db")

df_loaded = pd.read_sql("SELECT * FROM products LIMIT 5;", conn)
print(df_loaded)

conn.close()
   id                                              title   price  \
0   1  Fjallraven - Foldsack No. 1 Backpack, Fits 15 ...  109.95   
1   2             Mens Casual Premium Slim Fit T-Shirts    22.30   
2   3                                 Mens Cotton Jacket   55.99   
3   4                               Mens Casual Slim Fit   15.99   
4   5  John Hardy Women's Legends Naga Gold & Silver ...  695.00   

                                         description        category  \
0  Your perfect pack for everyday use and walks i...  men's clothing   
1  Slim-fitting style, contrast raglan long sleev...  men's clothing   
2  great outerwear jackets for Spring/Autumn/Wint...  men's clothing   
3  The color could be slightly different between ...  men's clothing   
4  From our Legends Collection, the Naga was insp...        jewelery   

   rating_rate  rating_count  
0          3.9           120  
1          4.1           259  
2          4.7           500  
3          2.1           430  
4          4.6           400  
In [18]:
conn = sqlite3.connect("product_data.db")

# Run SQL query (Modify this as needed)
query = "SELECT id,price FROM products WHERE price > 100 LIMIT 10;"
df_query_result = pd.read_sql(query, conn)

# Show result
print(df_query_result)

conn.close()
   id   price
0   1  109.95
1   5  695.00
2   6  168.00
3  10  109.00
4  11  109.00
5  12  114.00
6  13  599.00
7  14  999.99